# Results using sparklyr
# source("~/Testing/Joseph/trelliscope.R")
# source("C:/Users/trenton.pulsipher/Documents/R/r_prjs/tsCogs/Testing/Joseph/trelliscope.R")
library(tsCogs)
library(trelliscopejs)
library(magrittr)
library(sparklyr)
# Load the raw daily profile extract.
# Resolved merge conflict: the home-relative path is used; the earlier
# machine-specific absolute path is kept below, commented out, for reference.
# rawData <- readRDS("C:/Users/trenton.pulsipher/Documents/R/r_prjs/tsCogs/R_Data/rawDailyProfilesAll-20180206.rds")
rawData <- readRDS("~/R/R_prjs/tsCogs/R_Data/rawDailyProfilesAll-20180206.rds")
# Reshape the daily records into one row per account and one column per week.
# Each cell holds the weekly count divided by that account's mean weekly count.
rawData <- rawData %>%
  as_tibble() %>%
  rename(Date = ymd) %>%
  group_by(AccountNumber) %>%
  filter(!is.na(AccountNumber)) %>%
  arrange(Date) %>%
  # Collapse daily rows into weekly totals per account.
  mutate(Week = floor_date(Date, "week")) %>%
  group_by(AccountNumber, Week) %>%
  summarise(Count = sum(Count)) %>%
  rename(Date = Week) %>%
  # summarise() drops the last grouping level (Week), so the mean below is
  # computed within each AccountNumber.
  mutate(
    meanCount = mean(Count, na.rm = TRUE),
    normCount = Count / meanCount
  ) %>%
  select(AccountNumber, Date, normCount) %>%
  # Wide layout: one column per week, as required for clustering on weeks.
  spread(key = Date, value = normCount)
# Set up a local Spark connection.
sc <- spark_connect(master = "local")

# Upload the ungrouped wide table to Spark as "rawData".
# Resolved merge conflict: both sides upload the same ungrouped data with
# AccountNumber retained (the variant that dropped it was already commented
# out); AccountNumber is excluded later via the model formula instead.
rawData_tbl <- copy_to(sc, ungroup(rawData), "rawData", overwrite = TRUE)
# tic()
# set.seed(1234)
# numClusters = c(10,25,50,100,150,200,300,500,1000)
# out = list()
# for(i in 1:length(numClusters)) {
# out[[i]] = rawData_tbl %>%
# ml_kmeans(~.-AccountNumber, centers = numClusters[[i]])
# cat(numClusters[i], " ")
# }
# toc()
#
#
# qplot(x = numClusters, y = unlist(lapply(out, function(x) x$cost))) +
# geom_line() +
# labs(x = "Number of Clusters", y = "Total W/in Sums of Squares") +
# theme_bw()
# Fit a 200-centre k-means model on every column except AccountNumber.
mlKmeans <- ml_kmeans(rawData_tbl, ~ . - AccountNumber, centers = 200)
# Score every row with the fitted k-means model and collect the result into R.
# Resolved merge conflict: the incoming side is kept (variable `predict`
# instead of `bob`, line alpha of .05 instead of .1).
predict <- ml_predict(mlKmeans, rawData_tbl) %>%
  as_tibble()

# Build a trelliscope display with one panel per predicted cluster.
predict %>%
  select(-features) %>%
  # filter(prediction == 70) %>%
  # NOTE(review): the column range below is hard-coded and presumably matches
  # the first/last week of the 20180206 extract -- confirm when the data changes.
  gather("Date", "Count", `20140928`:`20180204`) %>%
  mutate(Date = ymd(Date)) %>%
  group_by(prediction) %>%
  # filter(Count > 0) %>%
  nest() %>%
  mutate(
    # Per-cluster cognostics: account count plus summary statistics of Count.
    cogs = map_cog(data, ~ tibble(
      numAccts = length(unique(.$AccountNumber)),
      total = sum(.$Count),
      mean = mean(.$Count),
      sd = sd(.$Count),
      cv = (sd(.$Count) / mean(.$Count))
    )),
    # Per-cluster panel: one faint line per account so dense regions stand out.
    panel = map_plot(data, ~ ggplot(., aes(x = Date, y = Count, group = AccountNumber)) +
      geom_line(alpha = .05) +
      theme_bw() +
      labs(x = "", y = "normalized count")
    )
  ) %>%
  trelliscope("Cluster Results", self_contained = TRUE)